In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import csv
# Load the question text (4th "|"-separated column) of every row of the theme export.
documents = []
with open('output/themes/Boirefumersedroguer2.csv','r') as f:
    reader = csv.reader(f, delimiter="|")
    for r in reader:
        # Blank or truncated rows have no 4th column; skip them rather than
        # aborting the whole load with an IndexError.
        if len(r) > 3:
            documents.append(r[3])

# NOTE(review): the cluster printout further down contains tokens such as "xe7a",
# which looks like an escaped "\xe7a" in the CSV text -- confirm the upstream encoding.

# The rows are in memory now, so the vectorizing happens outside the `with` block
# and the file handle is released as early as possible.
# min_df=4 ignores terms seen in fewer than 4 documents; max_features caps the vocabulary at 5000.
vectorizer = TfidfVectorizer(min_df=4, max_features=5000)
# fit_transform learns the vocabulary and idf weights and returns the document-term matrix.
vz = vectorizer.fit_transform(documents)
# Map every vocabulary term to its idf weight, e.g. tfidf.get("violer").
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

        
/usr/local/lib/python2.7/dist-packages/numpy/core/fromnumeric.py:2499: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.
  VisibleDeprecationWarning)
In [ ]:
 
In [3]:
from sklearn.decomposition import TruncatedSVD

# Compress the sparse tf-idf matrix to 50 dense components.
# Only the first 5000 rows of `vz` are used.
svd = TruncatedSVD(
    n_components=50,
    random_state=0,
)
svd_tfidf = svd.fit_transform(vz[:5000])
In [4]:
# Sanity check: rows = documents passed to the SVD, columns = n_components.
svd_tfidf.shape
Out[4]:
(5000, 50)
In [5]:
from sklearn.manifold import TSNE

# Embed the SVD-reduced documents in 2-D for plotting.
# verbose=1 prints progress; random_state=0 fixes the seed.
tsne_model = TSNE(
    n_components=2,
    verbose=1,
    random_state=0,
)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.091206
[t-SNE] Error after 83 iterations with early exaggeration: 23.830414
[t-SNE] Error after 193 iterations: 2.843235
In [6]:
# Sanity check: one row per embedded document, two t-SNE coordinates per row.
tsne_tfidf.shape
Out[6]:
(5000, 2)
In [7]:
# Peek at the 2-D coordinates of the first embedded document.
tsne_tfidf[0]
Out[7]:
array([-5.28212969,  2.69922466])
In [8]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()

# Keep the coordinates and the hover text in ONE data source. Passing raw arrays for
# x/y together with an explicit `source` is rejected by newer Bokeh releases, and
# column names work on old releases too.
n_points = tsne_tfidf.shape[0]
tfidf_source = bp.ColumnDataSource({
    "x": tsne_tfidf[:, 0],
    "y": tsne_tfidf[:, 1],
    # Slice by the number of embedded points so all columns have the same length.
    "question": documents[:n_points],
})

# Title names the theme actually loaded (Boirefumersedroguer2.csv), not "violences".
plot_tfidf = bp.figure(plot_width=900, plot_height=700,
    title="Ciao boire, fumer, se droguer questions (tf-idf)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_tfidf.scatter(x="x", y="y", source=tfidf_source)

# Show the original question text when hovering over a point.
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"Question": "@question"}
show(plot_tfidf)
Loading BokehJS ...
Out[8]:

<Bokeh Notebook handle for In[8]>

In [9]:
from sklearn.cluster import MiniBatchKMeans

# Cluster the full tf-idf matrix into `num_clusters` groups with mini-batch k-means.
num_clusters = 6
# random_state=0 matches the seed used for the SVD and t-SNE steps; without it the
# cluster ids and top terms change on every run and the notebook is not reproducible.
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                         init_size=1000, batch_size=100, verbose=False, max_iter=5000,
                         random_state=0)
kmeans = kmeans_model.fit(vz)
# Hard cluster label per document.
kmeans_clusters = kmeans.predict(vz)
# Distance from each document to every cluster centre (one column per cluster).
kmeans_distances = kmeans.transform(vz)
In [10]:
# For each cluster, list the 10 vocabulary terms with the largest centroid weights.
# argsort() is ascending, so each row is reversed to put the heaviest terms first.
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    print("Cluster %d:" % i)
    for j in sorted_centroids[i, :10]:
        print(' %s' % terms[j])
    # Blank separator line. A bare `print()` under Python 2 prints the empty
    # tuple "()" instead, so print an explicit empty string (works on 2 and 3).
    print("")
Cluster 0:
 ils
 fument
 pourquoi
 parents
 amis
 jeunes
 veulent
 disent
 fume
 fumer
()
Cluster 1:
 drogue
 drogues
 effets
 quels
 pourquoi
 quelle
 aimerais
 avance
 prend
 quelles
()
Cluster 2:
 fumer
 cigarette
 arreter
 joint
 fume
 xe7a
 dangereux
 temps
 joints
 voulais
()
Cluster 3:
 fume
 risque
 temps
 jour
 depuis
 ke
 joint
 joints
 arreter
 semaine
()
Cluster 4:
 cannabis
 combien
 sang
 alcool
 temps
 quel
 aimerais
 voudrais
 thc
 salut
()
Cluster 5:
 tout
 meme
 xe7a
 car
 temps
 vraiment
 depuis
 comme
 cela
 question
()
In [11]:
# Embed the k-means distance features of the first 5000 rows in 2-D for plotting.
# fit_transform refits the `tsne_model` created earlier (same settings, random_state=0).
tsne_kmeans = tsne_model.fit_transform(kmeans_distances[:5000])
[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.004437
[t-SNE] Error after 89 iterations with early exaggeration: 18.988980
[t-SNE] Error after 329 iterations: 2.218897
In [12]:
import numpy as np

# 20-colour categorical palette; a cluster id indexes directly into it.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

# Every column must describe the same points. The colours used to be sliced with
# [:10000] while the coordinates and questions were 5000 long, so all columns are
# now cut to the number of embedded points.
n_points = tsne_kmeans.shape[0]
point_clusters = kmeans_clusters[:n_points]
kmeans_source = bp.ColumnDataSource({
    "x": tsne_kmeans[:, 0],
    "y": tsne_kmeans[:, 1],
    "color": colormap[point_clusters],
    "question": documents[:n_points],
    "cluster": point_clusters,
})

# Title names the data actually plotted; the old one was left over from another dataset.
plot_kmeans = bp.figure(plot_width=900, plot_height=700,
    title="Ciao boire, fumer, se droguer questions (k-means)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# Reference columns by name so coordinates, colours and hover text stay aligned.
plot_kmeans.scatter(x="x", y="y", color="color", source=kmeans_source)

# Show the question and its cluster id on hover (list of tuples keeps the order).
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips=[("Question", "@question"), ("Cluster", "@cluster")]
show(plot_kmeans)
Out[12]:

<Bokeh Notebook handle for In[12]>